I. Preliminaries

Loading libraries

library("tidyverse")
library("tibble")
library("msigdbr")
library("ggplot2")
library("TCGAbiolinks")
library("RNAseqQC")
library("DESeq2")
library("ensembldb")
library("purrr")
library("magrittr")
library("vsn")
library("matrixStats")
library("dplyr")
library("grex")
library("survminer")
library("survival")

II. Downloading the TCGA gene expression data

Create a function for downloading TCGA gene expression data.

For more detailed documentation, refer to 2. Differential Gene Expression Analysis - TCGA.Rmd.

# Local directory passed as `directory` to the GDC download/prepare calls.
GDC_DIR <- "../data/public/GDCdata"

# Query GDC for the open-access STAR-count RNA-seq files of a project and keep
# only the cases that have both a "Primary Tumor" and a "Solid Tissue Normal"
# sample.
#
# Args:
#   project: GDC project identifier, e.g. "TCGA-COAD".
#
# Returns:
#   A list with two elements:
#     samples       - rows of the tumor and normal query results (tumor rows
#                     first) restricted to cases present in both.
#     query_project - the GDCquery() result restricted to those samples.
query_and_filter_samples <- function(project) {
  # The three queries differ only in sample type and (for the last one) the
  # barcode filter, so the shared arguments live in one place.
  query_expression <- function(sample_type, ...) {
    GDCquery(
      project = project,
      data.category = "Transcriptome Profiling",
      data.type = "Gene Expression Quantification",
      experimental.strategy = "RNA-Seq",
      workflow.type = "STAR - Counts",
      access = "open",
      sample.type = sample_type,
      ...
    )
  }

  tumor <- getResults(query_expression("Primary Tumor"))
  normal <- getResults(query_expression("Solid Tissue Normal"))

  # Cases that contributed both a tumor and a normal sample.
  matched_cases <- intersect(tumor$cases.submitter_id, normal$cases.submitter_id)
  tumor <- tumor %>%
    dplyr::filter(cases.submitter_id %in% matched_cases)
  normal <- normal %>%
    dplyr::filter(cases.submitter_id %in% matched_cases)

  samples <- rbind(tumor, normal)

  query_project <- query_expression(
    c("Solid Tissue Normal", "Primary Tumor"),
    barcode = as.list(samples$sample.submitter_id)
  )

  # If this is your first time running this notebook (i.e., you have not yet downloaded the results of the query in the previous block),
  # uncomment the code block below

  # GDCdownload(
  #   query_project,
  #   directory = GDC_DIR
  # )

  list(samples = samples, query_project = query_project)
}

Download the TCGA gene expression data for colorectal cancer (TCGA-COAD).

projects <- c("TCGA-COAD")

# Projects for which the query succeeded; downstream steps iterate over these.
with_results_projects <- c()

# Both lists are keyed by project name.
samples <- list()
project_data <- list()

for (project in projects) {
  result <- tryCatch(
    query_and_filter_samples(project),
    error = function(e) {
      # Report the failure instead of dropping the project without a trace.
      message("Skipping ", project, ": ", conditionMessage(e))
      NULL
    }
  )
  if (is.null(result)) next

  samples[[project]] <- result$samples
  project_data[[project]] <- result$query_project
  with_results_projects <- c(with_results_projects, project)
}

Running the code block above should generate and populate a directory named GDCdata.

III. Data preprocessing

Construct the RNA-seq count matrix for each cancer type.

# Both lists are keyed by project name.
tcga_data <- list()
tcga_matrix <- list()

# From here on, only work with projects whose query succeeded.
projects <- with_results_projects
for (project in projects) {
  tcga_data[[project]] <- GDCprepare(project_data[[project]], directory = GDC_DIR, summarizedExperiment = TRUE)
}
for (project in projects) {
  # Going through a data frame lets duplicated() compare whole rows.
  count_matrix_df <- data.frame(assay(tcga_data[[project]], "unstranded"))

  # Remove duplicate entries
  count_matrix_df <- count_matrix_df[!duplicated(count_matrix_df), ]
  count_matrix <- data.matrix(count_matrix_df)

  # After cleaning the IDs, drop every gene whose ID occurs more than once
  # (all copies are removed, not just the later ones).
  rownames(count_matrix) <- cleanid(rownames(count_matrix))
  gene_ids <- rownames(count_matrix)
  is_ambiguous <- gene_ids %in% gene_ids[duplicated(gene_ids)]
  count_matrix <- count_matrix[!is_ambiguous, ]

  tcga_matrix[[project]] <- count_matrix
}

Format the samples table so that it can be fed as input to DESeq2.

for (project in projects) {
  sample_table <- samples[[project]]

  # Key each row by the value in the `cases` column, then keep only the case
  # identifier and the sample type under shorter names.
  rownames(sample_table) <- sample_table$cases
  sample_table <- dplyr::select(sample_table, case = "cases.submitter_id", type = "sample_type")

  # Shorten the sample type labels.
  sample_table$type <- sample_table$type %>%
    str_replace("Solid Tissue Normal", "normal") %>%
    str_replace("Primary Tumor", "tumor")

  samples[[project]] <- sample_table
}

DESeq2 requires that the row names of samples be identical to the column names of count_matrix, and that they appear in the same order.

for (project in projects) {
  aligned <- tcga_matrix[[project]]
  sample_ids <- rownames(samples[[project]])

  # Column names carry "." where the sample identifiers use "-"; restore the
  # hyphens, then reorder the columns to follow the samples table.
  colnames(aligned) <- gsub(pattern = "\\.", replacement = "-", x = colnames(aligned))
  aligned <- aligned[, sample_ids]
  tcga_matrix[[project]] <- aligned

  # Sanity check
  print(all(colnames(aligned) == sample_ids))
}

IV. Differential gene expression analysis

For more detailed documentation on obtaining the gene set, refer to 7. Differential Gene Expression Analysis - TCGA - Pan-cancer - Unique Genes.Rmd.

# Directory holding the gene-set CSV files. The trailing slash is required
# because file names are appended to this path with paste0().
RCDdb <- "../data/public/rcd-gene-list/unique-genes/necroptosis-ferroptosis-pyroptosis/"

Write utility functions for filtering the gene sets, performing differential gene expression analysis, plotting the results, and performing variance-stabilizing transformation.

# Run a tumor-vs-normal DESeq2 analysis restricted to a gene set, for every
# project in the global `projects`.
#
# Reads the globals `projects`, `tcga_matrix` (count matrix per project) and
# `samples` (sample table per project, with a `type` column).
#
# Args:
#   genes: data frame with a `gene_id` column (unique, matching the row names
#     of the count matrices) and a `gene` column holding the gene symbol.
#
# Returns:
#   One data frame for all projects with the DESeq2 result columns plus
#   `cancer_type` and `gene_symbol`, keeping only genes with
#   abs(log2FoldChange) >= 1.5 and padj < 0.05.
filter_gene_set_and_perform_dgea <- function(genes) {
  # Index the gene table by gene_id once so symbols can be looked up by row
  # name when the results are assembled.
  rownames(genes) <- genes$gene_id

  deseq.bbl.data <- list()

  for (project in projects) {
    sample_table <- samples[[project]]
    project_counts <- tcga_matrix[[project]]

    # drop = FALSE keeps a matrix even when a single gene or sample remains.
    in_gene_set <- rownames(project_counts) %in% genes$gene_id
    project_counts <- project_counts[in_gene_set, rownames(sample_table), drop = FALSE]

    print(project)
    print("=============")
    dds <- DESeqDataSetFromMatrix(
      countData = project_counts,
      colData = sample_table,
      design = ~type
    )
    dds <- filter_genes(dds, min_count = 10)

    # Make "normal" the reference level so fold changes read tumor vs normal.
    dds$type <- relevel(dds$type, ref = "normal")
    deseq.results <- results(DESeq(dds))

    deseq.bbl.data[[project]] <- data.frame(
      row.names = rownames(deseq.results),
      baseMean = deseq.results$baseMean,
      log2FoldChange = deseq.results$log2FoldChange,
      lfcSE = deseq.results$lfcSE,
      stat = deseq.results$stat,
      pvalue = deseq.results$pvalue,
      padj = deseq.results$padj,
      cancer_type = project,
      gene_symbol = genes[rownames(deseq.results), "gene"]
    )
  }

  deseq.bbl.data.combined <- bind_rows(deseq.bbl.data)

  # Rows with a missing padj or fold change are dropped here as well, because
  # dplyr::filter() discards rows where the condition is NA.
  dplyr::filter(deseq.bbl.data.combined, abs(log2FoldChange) >= 1.5 & padj < 0.05)
}
# Bubble plot of differential expression results.
#
# For each cancer type, the genes ranked in the top 10 by absolute log2 fold
# change are shown. Point fill encodes the log2 fold change and point size
# encodes the adjusted p-value bin.
#
# Args:
#   deseq.bbl.data.combined: data frame with columns `padj`, `log2FoldChange`,
#     `cancer_type` and `gene_symbol`.
#
# Returns:
#   A ggplot object.
plot_dgea <- function(deseq.bbl.data.combined) {
  # Smaller adjusted p-values get larger points.
  sizes <- c("<10^-15" = 4, "10^-10" = 3, "10^-5" = 2, "0.05" = 1)

  # Bins are left-closed: [-Inf, 1e-15), [1e-15, 1e-10), [1e-10, 1e-5),
  # [1e-5, 0.05). Values of 0.05 or more fall outside every bin and become NA.
  deseq.bbl.data.combined <- deseq.bbl.data.combined %>%
    mutate(fdr_category = cut(padj,
      breaks = c(-Inf, 1e-15, 1e-10, 1e-5, 0.05),
      labels = c("<10^-15", "10^-10", "10^-5", "0.05"),
      right = FALSE
    ))

  top_genes <- deseq.bbl.data.combined %>%
    group_by(cancer_type) %>%
    mutate(rank = rank(-abs(log2FoldChange))) %>%
    dplyr::filter(rank <= 10) %>%
    ungroup()

  # The fill scale spans every gene passed in, not only the plotted ones.
  fill_limits <- range(deseq.bbl.data.combined$log2FoldChange)

  ggplot(top_genes, aes(y = cancer_type, x = gene_symbol, size = fdr_category, fill = log2FoldChange)) +
    geom_point(alpha = 0.5, shape = 21, color = "black") +
    scale_size_manual(values = sizes) +
    scale_fill_gradient2(low = "blue", mid = "white", high = "red", limits = fill_limits) +
    theme_minimal() +
    theme(
      axis.text.x = element_text(size = 9, angle = 90, hjust = 1),
      legend.position = "bottom"
    ) +
    labs(size = "Adjusted p-value", fill = "log2 FC", y = "Cancer type", x = "Gene")
}
# Apply DESeq2's variance-stabilizing transformation to the gene-set counts of
# every project in the global `projects`.
#
# Reads the globals `projects`, `tcga_matrix` (count matrix per project) and
# `samples` (sample table per project).
#
# Args:
#   genes: data frame with a `gene_id` column matching the row names of the
#     count matrices.
#
# Returns:
#   A list keyed by project name; each element is the transformed expression
#   matrix (genes in rows, samples in columns).
perform_vsd <- function(genes) {
  vsd_rcd <- list()

  for (project in projects) {
    sample_table <- samples[[project]]
    project_counts <- tcga_matrix[[project]]

    # drop = FALSE keeps a matrix even when a single gene or sample remains.
    in_gene_set <- rownames(project_counts) %in% genes$gene_id
    project_counts <- project_counts[in_gene_set, rownames(sample_table), drop = FALSE]

    print(project)
    print("=============")
    dds <- DESeqDataSetFromMatrix(
      countData = project_counts,
      colData = sample_table,
      design = ~type
    )
    dds <- filter_genes(dds, min_count = 10)

    # Perform variance stabilization
    # vst() fits its trend on a subset of `nsub` genes. The gene set is small,
    # so nsub is set to the number of genes whose mean normalized count
    # exceeds 10.
    dds <- estimateSizeFactors(dds)
    nsub <- sum(rowMeans(counts(dds, normalized = TRUE)) > 10)
    vsd <- vst(dds, nsub = nsub)
    vsd_rcd[[project]] <- assay(vsd)
  }

  vsd_rcd
}

Pyroptosis

Fetch the gene set of interest.

# Load the pyroptosis gene list and show it as read.
genes <- read.csv(paste0(RCDdb, "Pyroptosis.csv"))
print(genes)

# Clean the gene IDs, keep the first row for each ID, and discard rows whose
# ID is empty.
genes <- genes %>%
  mutate(gene_id = cleanid(gene_id)) %>%
  distinct(gene_id, .keep_all = TRUE) %>%
  subset(gene_id != "")
genes

Filter the genes to include only those in the gene set of interest, and then perform differential gene expression analysis.

# Run DESeq2 on the gene set; the result keeps only genes with
# abs(log2FoldChange) >= 1.5 and padj < 0.05.
deseq.bbl.data.combined <- filter_gene_set_and_perform_dgea(genes)
[1] "TCGA-COAD"
[1] "============="
Warning: some variables in design formula are characters, converting to factorsestimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
-- replacing outliers and refitting for 3 genes
-- DESeq argument 'minReplicatesForReplace' = 7 
-- original counts are preserved in counts(dds)
estimating dispersions
fitting model and testing
# Display the filtered differential expression table.
deseq.bbl.data.combined

Plot the results.

# Bubble plot of the genes ranked highest by absolute log2 fold change.
plot_dgea(deseq.bbl.data.combined)

Perform variance-stabilizing transformation for further downstream analysis (i.e., for survival analysis).

# `vsd` is a list keyed by project name, holding one transformed expression
# matrix per project.
vsd <- perform_vsd(genes)
[1] "TCGA-COAD"
[1] "============="

V. Downloading the clinical data

Download clinical data from TCGA, and perform some preprocessing:

- The deceased column should be FALSE if the patient is alive and TRUE otherwise.
- The overall_survival column should reflect the follow-up time if the patient is alive and the days to death otherwise.

# Fetch the clinical table of a project and derive the two columns used by the
# survival analysis.
#
# Args:
#   project: GDC project identifier, e.g. "TCGA-COAD".
#
# Returns:
#   The clinical data with two extra columns:
#     deceased         - FALSE when vital_status is "Alive", TRUE otherwise.
#     overall_survival - days_to_last_follow_up when vital_status is "Alive",
#                        days_to_death otherwise.
download_clinical_data <- function(project) {
  clinical_data <- GDCquery_clinic(project)

  is_alive <- clinical_data$vital_status == "Alive"
  clinical_data$deceased <- !is_alive
  clinical_data$overall_survival <- ifelse(
    is_alive,
    clinical_data$days_to_last_follow_up,
    clinical_data$days_to_death
  )

  clinical_data
}
# One clinical table per project, keyed by project name.
tcga_clinical <- setNames(lapply(projects, download_clinical_data), projects)

VI. Performing survival analysis

Write utility functions for performing survival analysis.

# Build the survival-analysis table for one gene in one project.
#
# Reads the globals `vsd` (transformed expression matrix per project), `genes`
# (gene table with `gene_id` and `gene` columns), `samples` (sample table per
# project) and `tcga_clinical` (clinical table per project).
#
# Tumor samples are split by the gene's expression quartiles: values at or
# above the third quartile are "HIGH", values at or below the first quartile
# are "LOW", and everything in between is discarded.
#
# Args:
#   gene_of_interest: gene symbol, matched against the `gene` column of `genes`.
#   project: project name used to index the global lists.
#
# Returns:
#   A data frame with one row per retained tumor sample, joined to the clinical
#   data of its case. It has zero rows when the gene is absent from the
#   expression matrix.
construct_gene_df <- function(gene_of_interest, project) {
  sample_table <- samples[[project]]
  tumor_ids <- rownames(sample_table)[which(sample_table$type == "tumor")]

  gene_df <- vsd[[project]] %>%
    as.data.frame() %>%
    rownames_to_column(var = "gene_id") %>%
    pivot_longer(cols = -gene_id, names_to = "case_id", values_to = "counts") %>%
    left_join(genes, by = "gene_id") %>%
    dplyr::filter(gene == gene_of_interest) %>%
    dplyr::filter(case_id %in% tumor_ids) %>%
    as.data.frame()

  q1 <- quantile(gene_df$counts, probs = 0.25)
  q3 <- quantile(gene_df$counts, probs = 0.75)
  gene_df$strata <- ifelse(gene_df$counts >= q3, "HIGH", ifelse(gene_df$counts <= q1, "LOW", "MIDDLE"))
  gene_df <- gene_df %>% dplyr::filter(strata %in% c("LOW", "HIGH"))

  # Keep the first three hyphen-separated fields of the sample identifier so it
  # can be matched against `submitter_id` in the clinical table. vapply()
  # returns character(0) for an empty table, so an absent gene yields an empty
  # result instead of an error.
  id_fields <- strsplit(as.character(gene_df$case_id), "-", fixed = TRUE)
  gene_df$case_id <- vapply(
    id_fields,
    function(fields) paste(fields[1:3], collapse = "-"),
    character(1)
  )

  merge(gene_df, tcga_clinical[[project]], by.x = "case_id", by.y = "submitter_id")
}
# Kaplan-Meier fit of overall survival, one curve per expression stratum.
#
# Args:
#   gene_df: data frame with columns `overall_survival`, `deceased` and `strata`.
#
# Returns:
#   The survfit object.
compute_surival_fit <- function(gene_df) {
  # The formula is written inline so that the call stored in the fit keeps the
  # literal formula and `gene_df` as its data argument.
  survfit(formula = Surv(overall_survival, deceased) ~ strata, data = gene_df)
}
# Cox proportional hazards model of overall survival on the expression stratum.
#
# Args:
#   gene_df: data frame with columns `overall_survival`, `deceased` and `strata`.
#
# Returns:
#   The coxph object.
compute_cox <- function(gene_df) {
  coxph(formula = Surv(overall_survival, deceased) ~ strata, data = gene_df)
}
# Draw the Kaplan-Meier curves of a fit with a p-value and a risk table.
#
# Args:
#   fit: survfit object, e.g. from compute_surival_fit().
#   data: data frame the fit was computed on. Defaults to the `gene_df`
#     variable visible from this function (the global one when called from the
#     top level), which is what this function always used previously.
#
# Returns:
#   The ggsurvplot object.
plot_survival <- function(fit, data = gene_df) {
  ggsurvplot(fit,
    data = data,
    pval = TRUE,
    risk.table = TRUE,
    risk.table.height = 0.3
  )
}
# Test for a difference between the survival curves of the expression strata.
#
# Args:
#   gene_df: data frame with columns `overall_survival`, `deceased` and `strata`.
#
# Returns:
#   The survdiff object.
compute_survival_diff <- function(gene_df) {
  survdiff(formula = Surv(overall_survival, deceased) ~ strata, data = gene_df)
}

Perform survival analysis by testing for the difference in the Kaplan-Meier curves using the G-rho family of Harrington and Fleming tests: https://rdrr.io/cran/survival/man/survdiff.html

Our genes of interest are GSDMD (the primary executor of pyroptosis) and the differentially expressed genes.

# Project/gene pairs whose survival curves differ at p < 0.05. The two vectors
# are parallel: element i of each describes the same hit.
significant_projects <- c()
significant_genes <- c()

# Running number of the analyses that were actually carried out.
ctr <- 1
for (project in projects) {
  # unique() keeps GSDMD first but avoids analysing it a second time when the
  # gene set already contains it.
  for (gene in unique(c("GSDMD", genes$gene))) {
    cat(project, gene, "\n\n")
    gene_df <- tryCatch(
      construct_gene_df(gene, project),
      error = function(e) {
        message("Could not build the expression table for ", gene, " in ", project, ": ", conditionMessage(e))
        NULL
      }
    )

    if (is.null(gene_df)) {
      cat("\n\n============================\n\n")
      next
    }

    if (nrow(gene_df) > 0) {
      tryCatch(
        {
          fit <- compute_surival_fit(gene_df)
          survival <- compute_survival_diff(gene_df)
          cox <- compute_cox(gene_df)
          print(ctr)
          ctr <- ctr + 1
          print(survival)
          cat("\n")
          print(cox)
          print(plot_survival(fit))

          # p-value of the survdiff chi-square statistic, with one degree of
          # freedom fewer than the number of strata.
          if (pchisq(survival$chisq, length(survival$n) - 1, lower.tail = FALSE) < 0.05) {
            significant_projects <- c(significant_projects, project)
            significant_genes <- c(significant_genes, gene)
          }
        },
        error = function(e) {
          # Report the failure and move on to the next gene.
          message("Survival analysis failed for ", gene, " in ", project, ": ", conditionMessage(e))
        }
      )
    }

    cat("\n\n============================\n\n")
  }
}
TCGA-COAD GSDMD 
Warning: Ran out of iterations and did not converge
[1] 1
Call:
survdiff(formula = Surv(overall_survival, deceased) ~ strata, 
    data = gene_df)

n=3, 21 observations deleted due to missingness.

            N Observed Expected (O-E)^2/E (O-E)^2/V
strata=HIGH 2        2    2.667     0.167         2
strata=LOW  1        1    0.333     1.333         2

 Chisq= 2  on 1 degrees of freedom, p= 0.2 

Call:
coxph(formula = Surv(overall_survival, deceased) ~ strata, data = gene_df)

               coef exp(coef)  se(coef) z p
strataLOW 2.215e+01 4.171e+09 4.566e+04 0 1

Likelihood ratio test=2.2  on 1 df, p=0.1383
n= 3, number of events= 3 
   (21 observations deleted due to missingness)


============================

TCGA-COAD CHMP7 

[1] 2
Call:
survdiff(formula = Surv(overall_survival, deceased) ~ strata, 
    data = gene_df)

n=4, 20 observations deleted due to missingness.

            N Observed Expected (O-E)^2/E (O-E)^2/V
strata=HIGH 3        3     2.92   0.00238    0.0105
strata=LOW  1        1     1.08   0.00641    0.0105

 Chisq= 0  on 1 degrees of freedom, p= 0.9 

Call:
coxph(formula = Surv(overall_survival, deceased) ~ strata, data = gene_df)

             coef exp(coef) se(coef)      z     p
strataLOW -0.1285    0.8794   1.2535 -0.103 0.918

Likelihood ratio test=0.01  on 1 df, p=0.9178
n= 4, number of events= 4 
   (20 observations deleted due to missingness)


============================

TCGA-COAD GSDMC 

[1] 3
Call:
survdiff(formula = Surv(overall_survival, deceased) ~ strata, 
    data = gene_df)

n=8, 16 observations deleted due to missingness.

            N Observed Expected (O-E)^2/E (O-E)^2/V
strata=HIGH 4        4     3.91   0.00198   0.00472
strata=LOW  4        4     4.09   0.00190   0.00472

 Chisq= 0  on 1 degrees of freedom, p= 0.9 

Call:
coxph(formula = Surv(overall_survival, deceased) ~ strata, data = gene_df)

              coef exp(coef) se(coef)      z     p
strataLOW -0.05336   0.94804  0.77685 -0.069 0.945

Likelihood ratio test=0  on 1 df, p=0.9453
n= 8, number of events= 8 
   (16 observations deleted due to missingness)


============================

TCGA-COAD ELANE 

[1] 4
Call:
survdiff(formula = Surv(overall_survival, deceased) ~ strata, 
    data = gene_df)

n=6, 18 observations deleted due to missingness.

            N Observed Expected (O-E)^2/E (O-E)^2/V
strata=HIGH 3        3     1.98     0.521     0.899
strata=LOW  3        3     4.02     0.257     0.899

 Chisq= 0.9  on 1 degrees of freedom, p= 0.3 

Call:
coxph(formula = Surv(overall_survival, deceased) ~ strata, data = gene_df)

             coef exp(coef) se(coef)      z     p
strataLOW -0.8606    0.4229   0.9325 -0.923 0.356

Likelihood ratio test=0.87  on 1 df, p=0.3496
n= 6, number of events= 6 
   (18 observations deleted due to missingness)


============================

TCGA-COAD IRF1 

[1] 5
Call:
survdiff(formula = Surv(overall_survival, deceased) ~ strata, 
    data = gene_df)

n=6, 18 observations deleted due to missingness.

            N Observed Expected (O-E)^2/E (O-E)^2/V
strata=HIGH 3        3     4.52     0.509      2.56
strata=LOW  3        3     1.48     1.551      2.56

 Chisq= 2.6  on 1 degrees of freedom, p= 0.1 

Call:
coxph(formula = Surv(overall_survival, deceased) ~ strata, data = gene_df)

           coef exp(coef) se(coef)    z    p
strataLOW 1.688     5.408    1.172 1.44 0.15

Likelihood ratio test=2.47  on 1 df, p=0.1159
n= 6, number of events= 6 
   (18 observations deleted due to missingness)


============================

TCGA-COAD CYCS 

[1] 6
Call:
survdiff(formula = Surv(overall_survival, deceased) ~ strata, 
    data = gene_df)

n=5, 19 observations deleted due to missingness.

            N Observed Expected (O-E)^2/E (O-E)^2/V
strata=HIGH 2        2     3.07     0.371      1.18
strata=LOW  3        3     1.93     0.589      1.18

 Chisq= 1.2  on 1 degrees of freedom, p= 0.3 

Call:
coxph(formula = Surv(overall_survival, deceased) ~ strata, data = gene_df)

           coef exp(coef) se(coef)    z     p
strataLOW 1.208     3.348    1.173 1.03 0.303

Likelihood ratio test=1.23  on 1 df, p=0.2675
n= 5, number of events= 5 
   (19 observations deleted due to missingness)


============================

TCGA-COAD GSDMA 

[1] 7
Call:
survdiff(formula = Surv(overall_survival, deceased) ~ strata, 
    data = gene_df)

n=6, 18 observations deleted due to missingness.

            N Observed Expected (O-E)^2/E (O-E)^2/V
strata=HIGH 4        4     4.68    0.0997     0.536
strata=LOW  2        2     1.32    0.3546     0.536

 Chisq= 0.5  on 1 degrees of freedom, p= 0.5 

Call:
coxph(formula = Surv(overall_survival, deceased) ~ strata, data = gene_df)

            coef exp(coef) se(coef)     z     p
strataLOW 0.7222    2.0589   1.0075 0.717 0.474

Likelihood ratio test=0.5  on 1 df, p=0.478
n= 6, number of events= 6 
   (18 observations deleted due to missingness)


============================

TCGA-COAD CASP4 

[1] 8
Call:
survdiff(formula = Surv(overall_survival, deceased) ~ strata, 
    data = gene_df)

n=8, 16 observations deleted due to missingness.

            N Observed Expected (O-E)^2/E (O-E)^2/V
strata=HIGH 5        5     5.71     0.089     0.369
strata=LOW  3        3     2.29     0.222     0.369

 Chisq= 0.4  on 1 degrees of freedom, p= 0.5 

Call:
coxph(formula = Surv(overall_survival, deceased) ~ strata, data = gene_df)

            coef exp(coef) se(coef)     z     p
strataLOW 0.4981    1.6456   0.8281 0.601 0.548

Likelihood ratio test=0.36  on 1 df, p=0.5493
n= 8, number of events= 8 
   (16 observations deleted due to missingness)


============================

TCGA-COAD BAK1 

[1] 9
Call:
survdiff(formula = Surv(overall_survival, deceased) ~ strata, 
    data = gene_df)

n=5, 19 observations deleted due to missingness.

            N Observed Expected (O-E)^2/E (O-E)^2/V
strata=HIGH 3        3     2.52    0.0928      0.26
strata=LOW  2        2     2.48    0.0941      0.26

 Chisq= 0.3  on 1 degrees of freedom, p= 0.6 

Call:
coxph(formula = Surv(overall_survival, deceased) ~ strata, data = gene_df)

             coef exp(coef) se(coef)      z     p
strataLOW -0.5892    0.5548   1.1718 -0.503 0.615

Likelihood ratio test=0.27  on 1 df, p=0.6001
n= 5, number of events= 5 
   (19 observations deleted due to missingness)


============================

TCGA-COAD NOD1 

[1] 10
Call:
survdiff(formula = Surv(overall_survival, deceased) ~ strata, 
    data = gene_df)

n=7, 17 observations deleted due to missingness.

            N Observed Expected (O-E)^2/E (O-E)^2/V
strata=HIGH 5        5      4.9   0.00214    0.0079
strata=LOW  2        2      2.1   0.00499    0.0079

 Chisq= 0  on 1 degrees of freedom, p= 0.9 

Call:
coxph(formula = Surv(overall_survival, deceased) ~ strata, data = gene_df)

             coef exp(coef) se(coef)      z     p
strataLOW -0.0781    0.9249   0.8787 -0.089 0.929

Likelihood ratio test=0.01  on 1 df, p=0.9289
n= 7, number of events= 7 
   (17 observations deleted due to missingness)


============================

TCGA-COAD NLRP7 

[1] 11
Call:
survdiff(formula = Surv(overall_survival, deceased) ~ strata, 
    data = gene_df)

n=6, 18 observations deleted due to missingness.

            N Observed Expected (O-E)^2/E (O-E)^2/V
strata=HIGH 2        2    0.783     1.890       2.6
strata=LOW  4        4    5.217     0.284       2.6

 Chisq= 2.6  on 1 degrees of freedom, p= 0.1 

Call:
coxph(formula = Surv(overall_survival, deceased) ~ strata, data = gene_df)

            coef exp(coef) se(coef)     z     p
strataLOW -1.766     0.171    1.235 -1.43 0.153

Likelihood ratio test=2.2  on 1 df, p=0.1382
n= 6, number of events= 6 
   (18 observations deleted due to missingness)


============================

TCGA-COAD CASP3 

[1] 12
Call:
survdiff(formula = Surv(overall_survival, deceased) ~ strata, 
    data = gene_df)

n=4, 20 observations deleted due to missingness.

            N Observed Expected (O-E)^2/E (O-E)^2/V
strata=HIGH 2        2     2.67     0.167     0.615
strata=LOW  2        2     1.33     0.333     0.615

 Chisq= 0.6  on 1 degrees of freedom, p= 0.4 

Call:
coxph(formula = Surv(overall_survival, deceased) ~ strata, data = gene_df)

            coef exp(coef) se(coef)     z     p
strataLOW 0.9406    2.5616   1.2403 0.758 0.448

Likelihood ratio test=0.62  on 1 df, p=0.4325
n= 4, number of events= 4 
   (20 observations deleted due to missingness)


============================

TCGA-COAD GSDMB 

[1] 13
Call:
survdiff(formula = Surv(overall_survival, deceased) ~ strata, 
    data = gene_df)

n=5, 19 observations deleted due to missingness.

            N Observed Expected (O-E)^2/E (O-E)^2/V
strata=HIGH 3        3     3.27    0.0218    0.0739
strata=LOW  2        2     1.73    0.0410    0.0739

 Chisq= 0.1  on 1 degrees of freedom, p= 0.8 

Call:
coxph(formula = Surv(overall_survival, deceased) ~ strata, data = gene_df)

            coef exp(coef) se(coef)     z     p
strataLOW 0.2739    1.3151   1.0107 0.271 0.786

Likelihood ratio test=0.07  on 1 df, p=0.7866
n= 5, number of events= 5 
   (19 observations deleted due to missingness)


============================

TCGA-COAD GZMB 

[1] 14
Call:
survdiff(formula = Surv(overall_survival, deceased) ~ strata, 
    data = gene_df)

n=5, 19 observations deleted due to missingness.

            N Observed Expected (O-E)^2/E (O-E)^2/V
strata=HIGH 3        3    4.017     0.257      1.59
strata=LOW  2        2    0.983     1.051      1.59

 Chisq= 1.6  on 1 degrees of freedom, p= 0.2 

Call:
coxph(formula = Surv(overall_survival, deceased) ~ strata, data = gene_df)

           coef exp(coef) se(coef)     z     p
strataLOW 1.439     4.215    1.236 1.164 0.244

Likelihood ratio test=1.46  on 1 df, p=0.2262
n= 5, number of events= 5 
   (19 observations deleted due to missingness)


============================

TCGA-COAD GSDME 

[1] 15
Call:
survdiff(formula = Surv(overall_survival, deceased) ~ strata, 
    data = gene_df)

n=4, 20 observations deleted due to missingness.

            N Observed Expected (O-E)^2/E (O-E)^2/V
strata=HIGH 1        1     0.25      2.25         3
strata=LOW  3        3     3.75      0.15         3

 Chisq= 3  on 1 degrees of freedom, p= 0.08 

Call:
coxph(formula = Surv(overall_survival, deceased) ~ strata, data = gene_df)

                coef  exp(coef)   se(coef)      z p
strataLOW -2.208e+01  2.562e-10  3.607e+04 -0.001 1

Likelihood ratio test=2.77  on 1 df, p=0.09589
n= 4, number of events= 4 
   (20 observations deleted due to missingness)


============================

TCGA-COAD CHMP3 

[1] 16
Call:
survdiff(formula = Surv(overall_survival, deceased) ~ strata, 
    data = gene_df)

n=4, 20 observations deleted due to missingness.

            N Observed Expected (O-E)^2/E (O-E)^2/V
strata=HIGH 2        2    0.833      1.63      2.88
strata=LOW  2        2    3.167      0.43      2.88

 Chisq= 2.9  on 1 degrees of freedom, p= 0.09 

Call:
coxph(formula = Surv(overall_survival, deceased) ~ strata, data = gene_df)

                coef  exp(coef)   se(coef)      z     p
strataLOW -2.168e+01  3.848e-10  2.943e+04 -0.001 0.999

Likelihood ratio test=3.58  on 1 df, p=0.05836
n= 4, number of events= 4 
   (20 observations deleted due to missingness)


============================

TCGA-COAD DPP9 

[1] 17
Call:
survdiff(formula = Surv(overall_survival, deceased) ~ strata, 
    data = gene_df)

n=3, 21 observations deleted due to missingness.

            N Observed Expected (O-E)^2/E (O-E)^2/V
strata=HIGH 2        2    2.167    0.0128    0.0588
strata=LOW  1        1    0.833    0.0333    0.0588

 Chisq= 0.1  on 1 degrees of freedom, p= 0.8 

Call:
coxph(formula = Surv(overall_survival, deceased) ~ strata, data = gene_df)

            coef exp(coef) se(coef)     z     p
strataLOW 0.3466    1.4142   1.4355 0.241 0.809

Likelihood ratio test=0.06  on 1 df, p=0.8096
n= 3, number of events= 3 
   (21 observations deleted due to missingness)


============================

TCGA-COAD NOD2 

[1] 18
Call:
survdiff(formula = Surv(overall_survival, deceased) ~ strata, 
    data = gene_df)

n=5, 19 observations deleted due to missingness.

            N Observed Expected (O-E)^2/E (O-E)^2/V
strata=HIGH 1        1     1.28    0.0626    0.0979
strata=LOW  4        4     3.72    0.0216    0.0979

 Chisq= 0.1  on 1 degrees of freedom, p= 0.8 

Call:
coxph(formula = Surv(overall_survival, deceased) ~ strata, data = gene_df)

            coef exp(coef) se(coef)     z     p
strataLOW 0.3695    1.4470   1.1865 0.311 0.755

Likelihood ratio test=0.1  on 1 df, p=0.7492
n= 5, number of events= 5 
   (19 observations deleted due to missingness)


============================

TCGA-COAD NLRC4 

[1] 19
Call:
survdiff(formula = Surv(overall_survival, deceased) ~ strata, 
    data = gene_df)

n=5, 19 observations deleted due to missingness.

            N Observed Expected (O-E)^2/E (O-E)^2/V
strata=HIGH 3        3    4.017     0.257      1.59
strata=LOW  2        2    0.983     1.051      1.59

 Chisq= 1.6  on 1 degrees of freedom, p= 0.2 

Call:
coxph(formula = Surv(overall_survival, deceased) ~ strata, data = gene_df)

           coef exp(coef) se(coef)     z     p
strataLOW 1.439     4.215    1.236 1.164 0.244

Likelihood ratio test=1.46  on 1 df, p=0.2262
n= 5, number of events= 5 
   (19 observations deleted due to missingness)


============================

TCGA-COAD GSDMD 

[1] 20
Call:
survdiff(formula = Surv(overall_survival, deceased) ~ strata, 
    data = gene_df)

n=3, 21 observations deleted due to missingness.

            N Observed Expected (O-E)^2/E (O-E)^2/V
strata=HIGH 2        2    2.667     0.167         2
strata=LOW  1        1    0.333     1.333         2

 Chisq= 2  on 1 degrees of freedom, p= 0.2 

Call:
coxph(formula = Surv(overall_survival, deceased) ~ strata, data = gene_df)

               coef exp(coef)  se(coef) z p
strataLOW 2.215e+01 4.171e+09 4.566e+04 0 1

Likelihood ratio test=2.2  on 1 df, p=0.1383
n= 3, number of events= 3 
   (21 observations deleted due to missingness)


============================

TCGA-COAD TIRAP 

[1] 21
Call:
survdiff(formula = Surv(overall_survival, deceased) ~ strata, 
    data = gene_df)

n=7, 17 observations deleted due to missingness.

            N Observed Expected (O-E)^2/E (O-E)^2/V
strata=HIGH 2        2      1.4    0.2547     0.359
strata=LOW  5        5      5.6    0.0638     0.359

 Chisq= 0.4  on 1 degrees of freedom, p= 0.5 

Call:
coxph(formula = Surv(overall_survival, deceased) ~ strata, data = gene_df)

             coef exp(coef) se(coef)      z     p
strataLOW -0.5445    0.5801   0.9202 -0.592 0.554

Likelihood ratio test=0.33  on 1 df, p=0.5629
n= 7, number of events= 7 
   (17 observations deleted due to missingness)


============================

TCGA-COAD SCAF11 

[1] 22
Call:
survdiff(formula = Surv(overall_survival, deceased) ~ strata, 
    data = gene_df)

n=4, 20 observations deleted due to missingness.

            N Observed Expected (O-E)^2/E (O-E)^2/V
strata=HIGH 1        1     2.08     0.563      1.78
strata=LOW  3        3     1.92     0.612      1.78

 Chisq= 1.8  on 1 degrees of freedom, p= 0.2 

Call:
coxph(formula = Surv(overall_survival, deceased) ~ strata, data = gene_df)

               coef exp(coef)  se(coef)     z     p
strataLOW 2.085e+01 1.136e+09 2.490e+04 0.001 0.999

Likelihood ratio test=2.77  on 1 df, p=0.09589
n= 4, number of events= 4 
   (20 observations deleted due to missingness)


============================

TCGA-COAD NLRP6 

[1] 23
Call:
survdiff(formula = Surv(overall_survival, deceased) ~ strata, 
    data = gene_df)

n=7, 17 observations deleted due to missingness.

            N Observed Expected (O-E)^2/E (O-E)^2/V
strata=HIGH 4        4     4.75     0.120     0.417
strata=LOW  3        3     2.25     0.254     0.417

 Chisq= 0.4  on 1 degrees of freedom, p= 0.5 

Call:
coxph(formula = Surv(overall_survival, deceased) ~ strata, data = gene_df)

            coef exp(coef) se(coef)     z     p
strataLOW 0.5293    1.6977   0.8287 0.639 0.523

Likelihood ratio test=0.4  on 1 df, p=0.5252
n= 7, number of events= 7 
   (17 observations deleted due to missingness)


============================

TCGA-COAD AIM2 

[1] 24
Call:
survdiff(formula = Surv(overall_survival, deceased) ~ strata, 
    data = gene_df)

n=6, 18 observations deleted due to missingness.

            N Observed Expected (O-E)^2/E (O-E)^2/V
strata=HIGH 4        4     4.38    0.0335     0.141
strata=LOW  2        2     1.62    0.0909     0.141

 Chisq= 0.1  on 1 degrees of freedom, p= 0.7 

Call:
coxph(formula = Surv(overall_survival, deceased) ~ strata, data = gene_df)

            coef exp(coef) se(coef)     z     p
strataLOW 0.3503    1.4194   0.9368 0.374 0.708

Likelihood ratio test=0.14  on 1 df, p=0.712
n= 6, number of events= 6 
   (18 observations deleted due to missingness)


============================

TCGA-COAD CASP6 

[1] 25
Call:
survdiff(formula = Surv(overall_survival, deceased) ~ strata, 
    data = gene_df)

n=7, 17 observations deleted due to missingness.

            N Observed Expected (O-E)^2/E (O-E)^2/V
strata=HIGH 3        3     3.45    0.0575     0.142
strata=LOW  4        4     3.55    0.0558     0.142

 Chisq= 0.1  on 1 degrees of freedom, p= 0.7 

Call:
coxph(formula = Surv(overall_survival, deceased) ~ strata, data = gene_df)

            coef exp(coef) se(coef)     z     p
strataLOW 0.3150    1.3703   0.8385 0.376 0.707

Likelihood ratio test=0.14  on 1 df, p=0.7076
n= 7, number of events= 7 
   (17 observations deleted due to missingness)


============================

TCGA-COAD NLRP2 

[1] 26
Call:
survdiff(formula = Surv(overall_survival, deceased) ~ strata, 
    data = gene_df)

n=5, 19 observations deleted due to missingness.

            N Observed Expected (O-E)^2/E (O-E)^2/V
strata=HIGH 2        2     1.48    0.1800     0.297
strata=LOW  3        3     3.52    0.0759     0.297

 Chisq= 0.3  on 1 degrees of freedom, p= 0.6 

Call:
coxph(formula = Surv(overall_survival, deceased) ~ strata, data = gene_df)

             coef exp(coef) se(coef)      z    p
strataLOW -0.5493    0.5774   1.0198 -0.539 0.59

Likelihood ratio test=0.29  on 1 df, p=0.5922
n= 5, number of events= 5 
   (19 observations deleted due to missingness)


============================

TCGA-COAD IRF2 

[1] 27
Call:
survdiff(formula = Surv(overall_survival, deceased) ~ strata, 
    data = gene_df)

n=4, 20 observations deleted due to missingness.

            N Observed Expected (O-E)^2/E (O-E)^2/V
strata=HIGH 3        3    3.417    0.0508     0.424
strata=LOW  1        1    0.583    0.2976     0.424

 Chisq= 0.4  on 1 degrees of freedom, p= 0.5 

Call:
coxph(formula = Surv(overall_survival, deceased) ~ strata, data = gene_df)

            coef exp(coef) se(coef)    z     p
strataLOW 0.8959    2.4495   1.4215 0.63 0.529

Likelihood ratio test=0.38  on 1 df, p=0.535
n= 4, number of events= 4 
   (20 observations deleted due to missingness)


============================

TCGA-COAD PJVK 

[1] 28
Call:
survdiff(formula = Surv(overall_survival, deceased) ~ strata, 
    data = gene_df)

n=7, 17 observations deleted due to missingness.

            N Observed Expected (O-E)^2/E (O-E)^2/V
strata=HIGH 3        3        2     0.506     0.856
strata=LOW  4        4        5     0.202     0.856

 Chisq= 0.9  on 1 degrees of freedom, p= 0.4 

Call:
coxph(formula = Surv(overall_survival, deceased) ~ strata, data = gene_df)

             coef exp(coef) se(coef)      z     p
strataLOW -0.8286    0.4366   0.9202 -0.901 0.368

Likelihood ratio test=0.83  on 1 df, p=0.3615
n= 7, number of events= 7 
   (17 observations deleted due to missingness)


============================

TCGA-COAD CASP5 

[1] 29
Call:
survdiff(formula = Surv(overall_survival, deceased) ~ strata, 
    data = gene_df)

n=4, 20 observations deleted due to missingness.

            N Observed Expected (O-E)^2/E (O-E)^2/V
strata=HIGH 3        3     2.92   0.00238    0.0105
strata=LOW  1        1     1.08   0.00641    0.0105

 Chisq= 0  on 1 degrees of freedom, p= 0.9 

Call:
coxph(formula = Surv(overall_survival, deceased) ~ strata, data = gene_df)

             coef exp(coef) se(coef)      z     p
strataLOW -0.1285    0.8794   1.2535 -0.103 0.918

Likelihood ratio test=0.01  on 1 df, p=0.9178
n= 4, number of events= 4 
   (20 observations deleted due to missingness)


============================

TCGA-COAD NLRP1 

[1] 30
Call:
survdiff(formula = Surv(overall_survival, deceased) ~ strata, 
    data = gene_df)

n=8, 16 observations deleted due to missingness.

            N Observed Expected (O-E)^2/E (O-E)^2/V
strata=HIGH 4        4      4.5    0.0546     0.144
strata=LOW  4        4      3.5    0.0700     0.144

 Chisq= 0.1  on 1 degrees of freedom, p= 0.7 

Call:
coxph(formula = Surv(overall_survival, deceased) ~ strata, data = gene_df)

            coef exp(coef) se(coef)     z     p
strataLOW 0.2922    1.3394   0.7736 0.378 0.706

Likelihood ratio test=0.14  on 1 df, p=0.7041
n= 8, number of events= 8 
   (16 observations deleted due to missingness)


============================

TCGA-COAD CASP9 

[1] 31
Call:
survdiff(formula = Surv(overall_survival, deceased) ~ strata, 
    data = gene_df)

n=6, 18 observations deleted due to missingness.

            N Observed Expected (O-E)^2/E (O-E)^2/V
strata=HIGH 3        3     4.27     0.376      1.67
strata=LOW  3        3     1.73     0.926      1.67

 Chisq= 1.7  on 1 degrees of freedom, p= 0.2 

Call:
coxph(formula = Surv(overall_survival, deceased) ~ strata, data = gene_df)

           coef exp(coef) se(coef)     z     p
strataLOW 1.394     4.033    1.164 1.198 0.231

Likelihood ratio test=1.69  on 1 df, p=0.1937
n= 6, number of events= 6 
   (18 observations deleted due to missingness)


============================

TCGA-COAD PLCG1 

[1] 32
Call:
survdiff(formula = Surv(overall_survival, deceased) ~ strata, 
    data = gene_df)

n=5, 19 observations deleted due to missingness.

            N Observed Expected (O-E)^2/E (O-E)^2/V
strata=HIGH 2        2     1.73    0.0410    0.0739
strata=LOW  3        3     3.27    0.0218    0.0739

 Chisq= 0.1  on 1 degrees of freedom, p= 0.8 

Call:
coxph(formula = Surv(overall_survival, deceased) ~ strata, data = gene_df)

             coef exp(coef) se(coef)      z     p
strataLOW -0.2739    0.7604   1.0107 -0.271 0.786

Likelihood ratio test=0.07  on 1 df, p=0.7866
n= 5, number of events= 5 
   (19 observations deleted due to missingness)


============================

TCGA-COAD IL18 

[1] 33
Call:
survdiff(formula = Surv(overall_survival, deceased) ~ strata, 
    data = gene_df)

n=8, 16 observations deleted due to missingness.

            N Observed Expected (O-E)^2/E (O-E)^2/V
strata=HIGH 5        5     4.63    0.0296     0.081
strata=LOW  3        3     3.37    0.0407     0.081

 Chisq= 0.1  on 1 degrees of freedom, p= 0.8 

Call:
coxph(formula = Surv(overall_survival, deceased) ~ strata, data = gene_df)

             coef exp(coef) se(coef)      z     p
strataLOW -0.2207    0.8020   0.7765 -0.284 0.776

Likelihood ratio test=0.08  on 1 df, p=0.7754
n= 8, number of events= 8 
   (16 observations deleted due to missingness)


============================

TCGA-COAD DPP8 

[1] 34
Call:
survdiff(formula = Surv(overall_survival, deceased) ~ strata, 
    data = gene_df)

n=5, 19 observations deleted due to missingness.

            N Observed Expected (O-E)^2/E (O-E)^2/V
strata=HIGH 1        1      0.2     3.200         4
strata=LOW  4        4      4.8     0.133         4

 Chisq= 4  on 1 degrees of freedom, p= 0.05 

Call:
coxph(formula = Surv(overall_survival, deceased) ~ strata, data = gene_df)

                coef  exp(coef)   se(coef)      z     p
strataLOW -2.204e+01  2.674e-10  3.057e+04 -0.001 0.999

Likelihood ratio test=3.22  on 1 df, p=0.07279
n= 5, number of events= 5 
   (19 observations deleted due to missingness)


============================

Display the results only for genes where a significant difference in survival has been reported.

significant_genes
[1] "DPP8"
num_significant_genes <- length(significant_genes)

# seq_len() yields an empty sequence when nothing was significant, so the loop
# body is simply skipped in that case.
for (i in seq_len(num_significant_genes)) {
  project <- significant_projects[[i]]
  gene <- significant_genes[[i]]

  cat(project, gene, "\n\n")
  gene_df <- construct_gene_df(gene, project)

  # Fit all three models first, then print, so that any model warning is
  # emitted before the printed results.
  fit <- compute_surival_fit(gene_df)
  survival <- compute_survival_diff(gene_df)
  cox <- compute_cox(gene_df)

  print(survival)
  cat("\n")
  print(cox)
  print(plot_survival(fit))

  cat("\n\n============================\n\n")
}
TCGA-COAD DPP8 
Warning: Loglik converged before variable  1 ; coefficient may be infinite. 
Call:
survdiff(formula = Surv(overall_survival, deceased) ~ strata, 
    data = gene_df)

n=5, 19 observations deleted due to missingness.

            N Observed Expected (O-E)^2/E (O-E)^2/V
strata=HIGH 1        1      0.2     3.200         4
strata=LOW  4        4      4.8     0.133         4

 Chisq= 4  on 1 degrees of freedom, p= 0.05 

Call:
coxph(formula = Surv(overall_survival, deceased) ~ strata, data = gene_df)

                coef  exp(coef)   se(coef)      z     p
strataLOW -2.204e+01  2.674e-10  3.057e+04 -0.001 0.999

Likelihood ratio test=3.22  on 1 df, p=0.07279
n= 5, number of events= 5 
   (19 observations deleted due to missingness)


============================


  1. De La Salle University, Manila, Philippines, ↩︎

  2. De La Salle University, Manila, Philippines, ↩︎

---
title: "Survival Analysis"
subtitle: "Colorectal Cancer | Pyroptosis | Unique Genes per RCD Type | Gene Expression of Tumor Samples"
author: 
  - Mark Edward M. Gonzales^[De La Salle University, Manila, Philippines, gonzales.markedward@gmail.com]
  - Dr. Anish M.S. Shrestha^[De La Salle University, Manila, Philippines, anish.shrestha@dlsu.edu.ph]
output: html_notebook
---

## I. Preliminaries

### Loading libraries

```{r, warning=FALSE, message=FALSE}
library("tidyverse")
library("tibble")
library("msigdbr")
library("ggplot2")
library("TCGAbiolinks")
library("RNAseqQC")
library("DESeq2")
library("ensembldb")
library("purrr")
library("magrittr")
library("vsn")
library("matrixStats")
library("dplyr")
library("grex")
library("survminer")
library("survival")
```

## II. Downloading the TCGA gene expression data 

Create a function for downloading TCGA gene expression data. 

For more detailed documentation, refer to `2. Differential Gene Expression Analysis - TCGA.Rmd`.

```{r}
# Directory that GDC downloads are written to and read from.
GDC_DIR <- "../data/public/GDCdata"

#' Query the open-access STAR count files of a TCGA project, restricted to
#' cases that have both a primary-tumor and a solid-tissue-normal sample.
#'
#' @param project GDC project identifier, e.g. "TCGA-COAD".
#' @param download If `TRUE`, download the files of the final query into
#'   `GDC_DIR` (needed the first time the notebook is run). Defaults to
#'   `FALSE`, which only builds the query.
#' @return A list with `samples` (the tumor rows followed by the normal rows
#'   of the paired cases) and `query_project` (the query limited to exactly
#'   those samples).
query_and_filter_samples <- function(project, download = FALSE) {
  # All three queries share these filters; only the sample type (and, for the
  # final query, the barcodes) differ.
  query_counts <- function(sample_type, ...) {
    GDCquery(
      project = project,
      data.category = "Transcriptome Profiling",
      data.type = "Gene Expression Quantification",
      experimental.strategy = "RNA-Seq",
      workflow.type = "STAR - Counts",
      access = "open",
      sample.type = sample_type,
      ...
    )
  }

  tumor <- getResults(query_counts("Primary Tumor"))
  normal <- getResults(query_counts("Solid Tissue Normal"))

  # Keep only the cases that contributed both a tumor and a normal sample.
  paired_cases <- intersect(tumor$cases.submitter_id, normal$cases.submitter_id)
  tumor <- tumor[tumor$cases.submitter_id %in% paired_cases, , drop = FALSE]
  normal <- normal[normal$cases.submitter_id %in% paired_cases, , drop = FALSE]

  samples <- rbind(tumor, normal)
  rownames(samples) <- NULL

  query_project <- query_counts(
    c("Solid Tissue Normal", "Primary Tumor"),
    barcode = as.list(samples$sample.submitter_id)
  )

  # First run only: fetch the files behind the final query. (The previously
  # commented-out call referred to `query_coad`, which is not defined here.)
  if (download) {
    GDCdownload(query_project, directory = GDC_DIR)
  }

  list(samples = samples, query_project = query_project)
}
```

Download the TCGA gene expression data for colorectal cancer (TCGA-COAD).

```{r, echo = TRUE, message = FALSE, results="hide"}
projects <- c("TCGA-COAD")

# Projects whose query succeeded; later chunks iterate over these only.
with_results_projects <- c()

# Per-project sample tables and queries, keyed by project name.
samples <- list()
project_data <- list()

for (project in projects) {
  tryCatch(
    {
      result <- query_and_filter_samples(project)
      samples[[project]] <- result$samples
      project_data[[project]] <- result$query_project

      with_results_projects <- c(with_results_projects, project)
    },
    error = function(e) {
      # Skip the project, but report why instead of dropping it silently.
      warning(
        "Skipping ", project, ": ", conditionMessage(e),
        call. = FALSE
      )
    }
  )
}
```

If the `GDCdownload` call in `query_and_filter_samples` is enabled, running the code block above generates and populates the `GDCdata` directory at the path stored in `GDC_DIR`.

## III. Data preprocessing

Construct the RNA-seq count matrix for each cancer type.

```{r, echo = TRUE, message = FALSE, results="hide"}
# Per-project containers, keyed by project name. `tcga_matrix` is only
# initialized here; it is filled in by the next chunk.
tcga_data <- list()
tcga_matrix <- list()

# From here on, only the projects whose query succeeded are processed.
projects <- with_results_projects
for (project in projects) {
  # Build the project's data object from its query, using GDC_DIR as the
  # data directory.
  tcga_data[[project]] <- GDCprepare(
    project_data[[project]], 
    directory = GDC_DIR,
    summarizedExperiment = TRUE
  )
}
```

```{r}
for (project in projects) {
  # Start from the "unstranded" assay of the prepared data.
  count_matrix_df <- data.frame(assay(tcga_data[[project]], "unstranded"))

  # Drop every row whose counts repeat an earlier row.
  is_repeated_row <- duplicated(count_matrix_df)
  count_matrix_df <- count_matrix_df[!is_repeated_row, ]
  count_matrix <- data.matrix(count_matrix_df)

  # Normalize the gene IDs with cleanid() (presumably strips the Ensembl
  # version suffix -- confirm against grex), then discard every ID that is
  # no longer unique, keeping none of its copies.
  gene_ids <- cleanid(rownames(count_matrix))
  rownames(count_matrix) <- gene_ids
  ambiguous_ids <- gene_ids[duplicated(gene_ids)]
  count_matrix <- count_matrix[!(gene_ids %in% ambiguous_ids), ]

  tcga_matrix[[project]] <- count_matrix
}
```
Format the `samples` table so that it can be fed as input to DESeq2.

```{r}
for (project in projects) {
  sample_table <- samples[[project]]

  # Index rows by the `cases` column, then keep only the case ID and the
  # sample type under shorter names.
  rownames(sample_table) <- sample_table$cases
  sample_table <- dplyr::select(
    sample_table,
    case = "cases.submitter_id",
    type = "sample_type"
  )

  # Shorten the sample-type labels.
  sample_table$type <- sample_table$type %>%
    str_replace("Solid Tissue Normal", "normal") %>%
    str_replace("Primary Tumor", "tumor")

  samples[[project]] <- sample_table
}
```

DESeq2 requires that the row names of `samples` be identical to the column names of `count_matrix`.

```{r, echo = TRUE, results="hide"}
for (project in projects) {
  project_counts <- tcga_matrix[[project]]
  sample_ids <- rownames(samples[[project]])

  # Turn the dots in the column names back into hyphens, then put the columns
  # in the same order as the rows of the sample table.
  colnames(project_counts) <- gsub(
    pattern = "\\.",
    replacement = "-",
    x = colnames(project_counts)
  )
  project_counts <- project_counts[, sample_ids]
  tcga_matrix[[project]] <- project_counts

  # Sanity check
  print(all(colnames(project_counts) == sample_ids))
}
```

## IV. Differential gene expression analysis

For more detailed documentation on obtaining the gene set, refer to `7. Differential Gene Expression Analysis - TCGA - Pan-cancer - Unique Genes.Rmd`.

```{r}
# Directory containing the gene-list CSV files. File names are appended with
# paste0(), so the trailing slash is required.
RCDdb <- "../data/public/rcd-gene-list/unique-genes/necroptosis-ferroptosis-pyroptosis/"
```

Write utility functions for filtering the gene sets, performing differential gene expression analysis, plotting the results, and performing variance-stabilizing transformation.

```{r}
#' Run a tumor-vs-normal DESeq2 analysis restricted to a gene set.
#'
#' Reads the notebook globals `projects`, `tcga_matrix` and `samples`.
#'
#' @param genes Data frame with at least the columns `gene_id` and `gene`.
#' @return One data frame covering all projects, holding the DESeq2 result
#'   columns plus `cancer_type` and `gene_symbol`, limited to rows with
#'   |log2FoldChange| >= 1.5 and padj < 0.05.
filter_gene_set_and_perform_dgea <- function(genes) {
  # Index the gene table by gene ID so symbols can be looked up by row name.
  rownames(genes) <- genes$gene_id

  per_project <- list()

  for (project in projects) {
    sample_table <- samples[[project]]
    all_counts <- tcga_matrix[[project]]

    # Keep the gene-set rows, with columns ordered like the sample table.
    gene_set_counts <- all_counts[rownames(all_counts) %in% genes$gene_id, ]
    gene_set_counts <- gene_set_counts[, rownames(sample_table)]

    print(project)
    print("=============")
    dds <- DESeqDataSetFromMatrix(
      countData = gene_set_counts,
      colData = sample_table,
      design = ~type
    )
    dds <- filter_genes(dds, min_count = 10)
    # "normal" is the reference level, so fold changes are tumor vs. normal.
    dds$type <- relevel(dds$type, ref = "normal")
    deseq_results <- results(DESeq(dds))

    per_project[[project]] <- data.frame(
      row.names = rownames(deseq_results),
      baseMean = deseq_results$baseMean,
      log2FoldChange = deseq_results$log2FoldChange,
      lfcSE = deseq_results$lfcSE,
      stat = deseq_results$stat,
      pvalue = deseq_results$pvalue,
      padj = deseq_results$padj,
      cancer_type = project,
      gene_symbol = genes[rownames(deseq_results), "gene"]
    )
  }

  bind_rows(per_project) %>%
    dplyr::filter(abs(log2FoldChange) >= 1.5 & padj < 0.05)
}
```

```{r}
#' Bubble plot of the strongest differentially expressed genes per cancer type.
#'
#' Point size encodes the adjusted p-value bin and fill encodes the log2 fold
#' change. Per cancer type, only genes ranked in the top 10 by absolute log2
#' fold change are drawn.
#'
#' @param deseq.bbl.data.combined Data frame with the columns `padj`,
#'   `log2FoldChange`, `cancer_type` and `gene_symbol`.
#' @return A ggplot object.
plot_dgea <- function(deseq.bbl.data.combined) {
  fdr_labels <- c("<10^-15", "10^-10", "10^-5", "0.05")
  sizes <- setNames(c(4, 3, 2, 1), fdr_labels)

  # Left-closed bins: [-Inf, 1e-15), [1e-15, 1e-10), [1e-10, 1e-5), [1e-5, 0.05)
  binned <- deseq.bbl.data.combined %>%
    mutate(fdr_category = cut(
      padj,
      breaks = c(-Inf, 1e-15, 1e-10, 1e-5, 0.05),
      labels = fdr_labels,
      right = FALSE
    ))

  top_genes <- binned %>%
    group_by(cancer_type) %>%
    mutate(rank = rank(-abs(log2FoldChange))) %>%
    dplyr::filter(rank <= 10) %>%
    ungroup()

  # The colour scale spans all rows passed in, not only the plotted ones.
  fold_change_limits <- range(binned$log2FoldChange)

  ggplot(
    top_genes,
    aes(
      y = cancer_type,
      x = gene_symbol,
      size = fdr_category,
      fill = log2FoldChange
    )
  ) +
    geom_point(alpha = 0.5, shape = 21, color = "black") +
    scale_size_manual(values = sizes) +
    scale_fill_gradient2(
      low = "blue",
      mid = "white",
      high = "red",
      limits = fold_change_limits
    ) +
    theme_minimal() +
    theme(
      axis.text.x = element_text(size = 9, angle = 90, hjust = 1),
      legend.position = "bottom"
    ) +
    labs(
      size = "Adjusted p-value",
      fill = "log2 FC",
      y = "Cancer type",
      x = "Gene"
    )
}
```

```{r}
#' Variance-stabilize the counts of a gene set, project by project.
#'
#' Reads the notebook globals `projects`, `tcga_matrix` and `samples`.
#'
#' @param genes Data frame with at least the column `gene_id`.
#' @return A list keyed by project name; each element is the assay matrix of
#'   the variance-stabilized data for that project.
perform_vsd <- function(genes) {
  rownames(genes) <- genes$gene_id

  vsd_rcd <- list()

  for (project in projects) {
    sample_table <- samples[[project]]
    all_counts <- tcga_matrix[[project]]

    # Keep the gene-set rows, with columns ordered like the sample table.
    gene_set_counts <- all_counts[rownames(all_counts) %in% genes$gene_id, ]
    gene_set_counts <- gene_set_counts[, rownames(sample_table)]

    print(project)
    print("=============")
    dds <- DESeqDataSetFromMatrix(
      countData = gene_set_counts,
      colData = sample_table,
      design = ~type
    )
    dds <- filter_genes(dds, min_count = 10)

    # Perform variance stabilization. `nsub` is set to the number of genes
    # whose mean normalized count exceeds 10.
    dds <- estimateSizeFactors(dds)
    mean_normalized <- rowMeans(counts(dds, normalized = TRUE))
    n_expressed <- sum(mean_normalized > 10)
    vsd_rcd[[project]] <- assay(vst(dds, nsub = n_expressed))
  }

  vsd_rcd
}
```


#### Pyroptosis

Fetch the gene set of interest.

```{r}
# Show the gene list exactly as read from disk.
genes <- read.csv(paste0(RCDdb, "Pyroptosis.csv"))
print(genes)

# Normalize the gene IDs, keep the first row for each ID, and drop rows whose
# ID is empty.
genes <- genes %>%
  dplyr::mutate(gene_id = cleanid(gene_id)) %>%
  distinct(gene_id, .keep_all = TRUE) %>%
  subset(gene_id != "")
genes
```

Filter the genes to include only those in the gene set of interest, and then perform differential gene expression analysis.

```{r}
# Genes of the set that pass the fold-change and adjusted p-value thresholds
# applied inside filter_gene_set_and_perform_dgea(); printed for inspection.
deseq.bbl.data.combined <- filter_gene_set_and_perform_dgea(genes)
deseq.bbl.data.combined
```

Plot the results.

```{r}
# Bubble plot of the filtered results (size: adjusted p-value bin, fill: log2
# fold change).
plot_dgea(deseq.bbl.data.combined)
```
Perform variance-stabilizing transformation for further downstream analysis (i.e., for survival analysis).

```{r, warning=FALSE}
# List of variance-stabilized matrices keyed by project; construct_gene_df()
# reads this global.
vsd <- perform_vsd(genes)
```

## V. Downloading the clinical data

Download clinical data from TCGA, and perform some preprocessing:

- The `deceased` column should be `FALSE` if the patient is alive and `TRUE` otherwise.
- The `overall_survival` column should reflect the follow-up time if the patient is alive and the days to death otherwise.

```{r}
#' Fetch a project's clinical table and derive the survival columns.
#'
#' @param project GDC project identifier passed to `GDCquery_clinic()`.
#' @return The clinical table with two added columns: `deceased` (`FALSE`
#'   when `vital_status` is "Alive", `TRUE` for any other value, `NA` when it
#'   is missing) and `overall_survival` (`days_to_last_follow_up` for living
#'   patients, `days_to_death` otherwise).
download_clinical_data <- function(project) {
  clinical_data <- GDCquery_clinic(project)

  is_alive <- clinical_data$vital_status == "Alive"
  clinical_data$deceased <- !is_alive

  # NOTE(review): in this notebook's rendered output every fitted model
  # reports n equal to the number of events, which suggests rows for living
  # patients are dropped for missing values -- verify that
  # `days_to_last_follow_up` is actually populated in the clinical table.
  clinical_data$overall_survival <- ifelse(
    is_alive,
    clinical_data$days_to_last_follow_up,
    clinical_data$days_to_death
  )

  clinical_data
}
```

```{r}
# One clinical table per project, keyed by project name.
tcga_clinical <- setNames(
  lapply(projects, download_clinical_data),
  projects
)
```

## VI. Performing survival analysis

Write utility functions for performing survival analysis.


```{r}
#' Build the survival-analysis table for one gene in one project.
#'
#' Reads the notebook globals `vsd`, `genes`, `samples` and `tcga_clinical`.
#' Tumor samples are split on the gene's variance-stabilized expression:
#' values at or above the third quartile are labelled "HIGH", values at or
#' below the first quartile "LOW", and everything in between is dropped.
#'
#' @param gene_of_interest Gene symbol, matched against `genes$gene`.
#' @param project Project name used to index the global lists.
#' @return A data frame with one row per retained tumor sample, holding the
#'   expression value, its `strata` label and the clinical columns of the
#'   matching case. It has zero rows when the gene is not present in the
#'   project's expression matrix (this previously raised an error).
construct_gene_df <- function(gene_of_interest, project) {
  sample_table <- samples[[project]]
  tumor_ids <- rownames(sample_table)[sample_table$type == "tumor"]

  # Long format: one row per (gene, sample), annotated with the gene table.
  gene_df <- vsd[[project]] %>%
    as.data.frame() %>%
    rownames_to_column(var = "gene_id") %>%
    pivot_longer(
      cols = -gene_id,
      names_to = "case_id",
      values_to = "counts"
    ) %>%
    left_join(genes, by = "gene_id") %>%
    dplyr::filter(gene == gene_of_interest) %>%
    dplyr::filter(case_id %in% tumor_ids)

  # Quartiles are computed over all tumor samples before the middle is dropped.
  q1 <- quantile(gene_df$counts, probs = 0.25)
  q3 <- quantile(gene_df$counts, probs = 0.75)
  gene_df <- gene_df %>%
    dplyr::filter(counts >= q3 | counts <= q1) %>%
    dplyr::mutate(strata = ifelse(counts >= q3, "HIGH", "LOW"))

  # Reduce each sample ID to its first three hyphen-separated fields so it can
  # be matched against `submitter_id` in the clinical table. vapply() returns
  # character(0) for an empty table, so a gene with no rows flows through
  # instead of failing on a length mismatch.
  gene_df$case_id <- vapply(
    strsplit(as.character(gene_df$case_id), "-"),
    function(fields) paste(fields[1:3], collapse = "-"),
    character(1)
  )

  merge(
    gene_df,
    tcga_clinical[[project]],
    by.x = "case_id",
    by.y = "submitter_id"
  )
}
```

```{r}
#' Fit survival curves for the expression strata.
#'
#' @param gene_df Data frame with the columns `overall_survival`, `deceased`
#'   and `strata`.
#' @return The object returned by `survfit()`.
compute_survival_fit <- function(gene_df) {
  survfit(Surv(overall_survival, deceased) ~ strata, data = gene_df)
}

# Backward-compatible alias: existing chunks call the function under its
# original, misspelled name.
compute_surival_fit <- compute_survival_fit
```

```{r}
#' Fit a Cox model of survival on the expression strata.
#'
#' @param gene_df Data frame with the columns `overall_survival`, `deceased`
#'   and `strata`.
#' @return The object returned by `coxph()`.
compute_cox <- function(gene_df) {
  # The call is echoed verbatim under "Call:" when the model is printed, so
  # the formula and the `gene_df` argument name are part of the output.
  return (coxph(Surv(overall_survival, deceased) ~ strata, data=gene_df))
}
```

```{r}
#' Plot survival curves with a p-value and a risk table.
#'
#' @param fit A fitted survival object, as returned by
#'   `compute_surival_fit()`.
#' @param data The data frame the fit was computed from. Defaults to the
#'   global `gene_df`, which is what this function previously always used, so
#'   existing `plot_survival(fit)` calls behave as before.
#' @return The object returned by `ggsurvplot()`.
plot_survival <- function(fit, data = gene_df) {
  ggsurvplot(
    fit,
    data = data,
    pval = TRUE,
    risk.table = TRUE,
    risk.table.height = 0.3
  )
}
```

```{r}
#' Test for a difference in survival between the expression strata.
#'
#' @param gene_df Data frame with the columns `overall_survival`, `deceased`
#'   and `strata`.
#' @return The object returned by `survdiff()`; the caller reads its `chisq`
#'   and `n` components.
compute_survival_diff <- function(gene_df) {
  # The call is echoed verbatim under "Call:" when the result is printed.
  return(survdiff(Surv(overall_survival, deceased) ~ strata, data = gene_df))
}
```

Perform survival analysis by testing for the difference in the Kaplan-Meier curves using the G-rho family of Harrington and Fleming tests (`survdiff`; with its default `rho = 0`, as used here, this is the log-rank test): https://rdrr.io/cran/survival/man/survdiff.html

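Our genes of interest are GSDMD (the primary executor of pyroptosis) and the genes in the pyroptosis gene set loaded above; note that the loop below iterates over every gene in `genes`, not only the differentially expressed ones.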

```{r}
# Parallel vectors: entry i of each describes one significant result.
significant_projects <- c()
significant_genes <- c()

# Running number of the analyses that were actually fitted and printed.
ctr <- 1
for (project in projects) {
  # unique() keeps a symbol from being analysed (and recorded) twice, e.g. if
  # GSDMD is also listed in the gene set.
  for (gene in unique(c("GSDMD", genes$gene))) {
    cat(project, gene, "\n\n")
    built <- tryCatch(
      construct_gene_df(gene, project),
      error = function(e) e
    )

    if (inherits(built, "error")) {
      # Skip this gene, but report why instead of dropping it silently.
      message("Skipping ", gene, ": ", conditionMessage(built))
      cat("\n\n============================\n\n")
      next
    }

    # Assigned only on success and kept under this name: plot_survival() looks
    # up `gene_df` outside its own body.
    gene_df <- built

    if (nrow(gene_df) > 0) {
      tryCatch(
        {
          # The fit is inside the handler too, so a failure in any of the
          # models skips this gene instead of aborting the whole loop.
          fit <- compute_surival_fit(gene_df)
          survival <- compute_survival_diff(gene_df)
          cox <- compute_cox(gene_df)
          print(ctr)
          ctr <- ctr + 1
          print(survival)
          cat("\n")
          print(cox)
          print(plot_survival(fit))

          # Chi-square p-value with (number of groups - 1) degrees of freedom.
          p_value <- pchisq(
            survival$chisq,
            length(survival$n) - 1,
            lower.tail = FALSE
          )
          if (p_value < 0.05) {
            significant_projects <- c(significant_projects, project)
            significant_genes <- c(significant_genes, gene)
          }
        },
        error = function(e) {
          message(
            "Survival analysis failed for ", project, " ", gene, ": ",
            conditionMessage(e)
          )
        }
      )
    }

    cat("\n\n============================\n\n")
  }
}
```

Display the results only for genes where a significant difference in survival has been reported.

```{r}
# Genes whose survival-difference p-value was below 0.05 in the loop above.
significant_genes
```

```{r}
num_significant_genes <- length(significant_genes)

# seq_len() yields an empty sequence when nothing was significant, so the loop
# body is simply skipped in that case.
for (i in seq_len(num_significant_genes)) {
  project <- significant_projects[[i]]
  gene <- significant_genes[[i]]

  cat(project, gene, "\n\n")
  gene_df <- construct_gene_df(gene, project)

  # Fit all three models first, then print, so that any model warning is
  # emitted before the printed results.
  fit <- compute_surival_fit(gene_df)
  survival <- compute_survival_diff(gene_df)
  cox <- compute_cox(gene_df)

  print(survival)
  cat("\n")
  print(cox)
  print(plot_survival(fit))

  cat("\n\n============================\n\n")
}
```